library(data.table)
library(Rtsne)
library(R.utils)
library(ggplot2)
library(ggcorrplot)
library(umap)
theme_set(theme_bw())

Processing data

The original data files were downloaded from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE120575. They were split into two files to make sure the file size is smaller than 100MB, and the original file has a 2nd header line for time points: “Pre_P1”, “Post_P1”, “Post_P1_2”, and that row was deleted in file “GSE120575_Sade_Feldman_melanoma_single_cells_nolabel_1.txt.gz”.

fnm1 = "GSE120575_Sade_Feldman_melanoma_single_cells_nolabel_1.txt.gz"
fnm2 = "GSE120575_Sade_Feldman_melanoma_single_cells_2.txt.gz"
sf_tpm1 = fread(paste0("../../scRNAseq/Sade_Feldman_2018/", fnm1), 
                fill = TRUE, header = TRUE)
sf_tpm2 = fread(paste0("../../scRNAseq/Sade_Feldman_2018/", fnm2), 
                fill = TRUE, drop = 16293, col.names = colnames(sf_tpm1))

dim(sf_tpm1)
## [1] 14998 16292
dim(sf_tpm2)
## [1] 40739 16292
sf_tpm1[1:2,1:5]
sf_tpm2[1:2,1:5]
ls_tpm = list(sf_tpm1,sf_tpm2)
sf_tpm = rbindlist(ls_tpm)
rm(ls_tpm)
rm(sf_tpm1)
rm(sf_tpm2)
dim(sf_tpm) # 55737 x 16292
## [1] 55737 16292
sf_tpm[1:2,1:5]
cell.names = colnames(sf_tpm)[-1]
gene.names = sf_tpm$V1
rownames(sf_tpm) = gene.names
sf_tpm$V1 = NULL

Initial QC

How many transcripts per cell

Split the cells to reduce memory usage.

grouping = rep(1:82, each=200)[1:length(cell.names)]
reads.per.cell = NULL
for (i in 1:82) {
  ss = cell.names[grouping == i]
  temp.reads = unlist(sf_tpm[, lapply(.SD, sum), .SDcols = ss])
  reads.per.cell = c(reads.per.cell,temp.reads)
}
hist(reads.per.cell, main = "", xlab = "Summation of TPM per cell", breaks=50)

How many genes are expressed per cell

genes.per.cell = NULL
for (i in 1:82) {
  ss = cell.names[grouping == i]
  temp.counts = unlist(sf_tpm[, lapply(.SD, FUN = function(x) sum(x>0)), .SDcols = ss])
  genes.per.cell = c(genes.per.cell,temp.counts)
}
hist(genes.per.cell, main = "", xlab = "Number of expressed genes per cell", breaks=50)

rm(grouping)

How many cells each gene is expressed

gene.grouping = rep(1:28, each = 2000)[1:dim(sf_tpm)[1]]
cells.per.gene = NULL
for (i in 1:28) {
  temp.counts = rowSums(sf_tpm[gene.grouping==i,] > 0)
  cells.per.gene = c(cells.per.gene,temp.counts)
}
summary(cells.per.gene)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       5      30     704     374   16291
table(cells.per.gene == 0)
## 
## FALSE  TRUE 
## 50513  5224
table(cells.per.gene == 1)
## 
## FALSE  TRUE 
## 53323  2414
table(cells.per.gene <= 5)
## 
## FALSE  TRUE 
## 40941 14796
hist(log10(cells.per.gene), main = "", breaks=50, 
     xlab = "log10(number of cells) each gene is expressed")

rm(gene.grouping)

Choose high-variance genes

plot the variance of the expression levels for each gene against its mean expression level

# kept 12705 genes
large.express.genes.all = gene.names[which(cells.per.gene > 500)]
large.expressed.tpm = sf_tpm[match(large.express.genes.all,rownames(sf_tpm)),]

df_mv = data.frame(gene=large.express.genes.all, 
                   mean=rowMeans(large.expressed.tpm), 
                   var=apply(large.expressed.tpm, 1, var))
ggplot(data = df_mv) + 
  geom_point(aes(x=mean,y=var),size = 0.1)

rm(large.expressed.tpm)

Next we divide the genes into 20 equal-size bins based on their mean expression levels. In each bin, we select the first 50 genes with the highest variance

df_mv$grouping = cut_number(df_mv$mean, n=20)
table(df_mv$grouping)
## 
## [0.0206,0.149]  (0.149,0.195]  (0.195,0.235]  (0.235,0.277]  (0.277,0.323] 
##            636            635            635            635            636 
##  (0.323,0.378]  (0.378,0.436]  (0.436,0.504]  (0.504,0.576]  (0.576,0.666] 
##            635            635            635            635            636 
##  (0.666,0.769]  (0.769,0.886]   (0.886,1.02]    (1.02,1.18]    (1.18,1.38] 
##            635            635            635            635            636 
##    (1.38,1.64]    (1.64,1.98]    (1.98,2.57]    (2.57,3.72]    (3.72,16.4] 
##            635            635            635            635            636
hv.genes.list = by(df_mv, df_mv$grouping, 
                   FUN = function(df) return(df$gene[order(df$var,decreasing = TRUE)][1:50]))
hv.genes.all = as.character(unlist(hv.genes.list))
rm(hv.genes.list)
hvg_tpm = sf_tpm[match(hv.genes.all,rownames(sf_tpm)),]
hvg_tpm = t(hvg_tpm)
colnames(hvg_tpm) = hv.genes.all

dim(hvg_tpm)
## [1] 16291  1000
hvg_tpm[1:2,1:5]
##            ZNF829 PHLDB1 RP11-4F22.2 GBAP1 ULK2
## A10_P3_M11      0      0           0     0    0
## A11_P1_M11      0      0           0     0    0
rm(df_mv)

df_mv2 = data.frame(gene=hv.genes.all, mean=colMeans(hvg_tpm), 
                   var=apply(hvg_tpm, 2, var))
ggplot(data = df_mv2) + 
  geom_point(aes(x=mean,y=var),size = 0.1)

Perform PCA on selected genes

hvg_tpm = log(hvg_tpm + 1)
pca.hvg = princomp(hvg_tpm, cor = TRUE)

screen plot

eigvals = pca.hvg$sdev^2
pve = eigvals/sum(eigvals)
plot(1:50, pve[1:50], main = "scree plot (first 50 PCs)", type="b", 
     xlab="i-th PC", ylab="Proportion of Variance Explained")

plot of cumulative proportion of variance explained

yvec = cumsum(pve)
plot(yvec, type="l", main = "plot of cumulative proportion of variance", 
     xlab = "i-th Principal Components", 
     ylab = "Cumulative variance", ylim=c(0,1))
abline(h=0.8, lty = 2)

plot(yvec[1:100], type="l", main = "plot of cumulative proportion of variance", 
     xlab = "i-th Principal Components", 
     ylab = "Cumulative variance")

pca.scores = pca.hvg$scores
rm(pca.hvg)

Clustering on top 50 PCs

top50pcs = pca.scores[,1:50]
rm(pca.scores)

Calculate TSNE

set.seed(9999)
tsne = Rtsne(top50pcs, pca=FALSE)
df_tsne = as.data.frame(tsne$Y)
names(df_tsne) = paste0("topPC_TSNE", seq(ncol(tsne$Y)))
rm(tsne)

Calculate UMAP

pcs_umap = umap(top50pcs)
df_umap  = as.data.frame(pcs_umap$layout)
names(df_umap) = paste0("topPC_UMAP",seq(ncol(df_umap)))
rm(pcs_umap)

K-means clustering on top 50 PCs

Prepare dataframe for k-means clustering using top 50 PCs

df_top50PCs = cbind(top50pcs, df_tsne)
df_top50PCs = cbind(df_top50PCs, df_umap)

rm(df_tsne)
rm(df_umap)

dim(df_top50PCs)
## [1] 16291    54
df_top50PCs[1:2, c(1:2,48:52)]

Try Kmeans for 6 to 12 clusters

set.seed(9999)
all_num_clust = c(6:12)
for (num_clust in all_num_clust) {
  cat(paste0("KM with ",num_clust," clusters.\n"))
  kmeans_out = kmeans(top50pcs, centers = num_clust, iter.max = 1e3, 
                      nstart = 50, algorithm = "MacQueen")
  
  print(kmeans_out[c("betweenss","tot.withinss")])
  km_label = paste0("KM_",num_clust)
  df_top50PCs[[km_label]] = as.factor(kmeans_out$cluster)
  
  pt = ggplot(data=df_top50PCs) + 
    geom_point(aes(x=topPC_UMAP1, y=topPC_UMAP2, 
                   color=eval(as.name(paste(km_label)))), size = 0.1) + 
    scale_colour_discrete(name=paste(km_label)) + 
    guides(color = guide_legend(override.aes = list(size = 2)))
  print(pt)
  
  pt = ggplot(data=df_top50PCs) + 
    geom_point(aes(x=topPC_TSNE1, y=topPC_TSNE2, 
                   color=eval(as.name(paste(km_label)))), size = 0.1) + 
    scale_colour_discrete(name=paste(km_label)) + 
    guides(color = guide_legend(override.aes = list(size = 2)))
  print(pt)
  
  rm(kmeans_out)
  rm(pt)

}
## KM with 6 clusters.
## $betweenss
## [1] 1784958
## 
## $tot.withinss
## [1] 2031680

## KM with 7 clusters.
## $betweenss
## [1] 1883331
## 
## $tot.withinss
## [1] 1933306

## KM with 8 clusters.
## $betweenss
## [1] 1968328
## 
## $tot.withinss
## [1] 1848310

## KM with 9 clusters.
## $betweenss
## [1] 2038536
## 
## $tot.withinss
## [1] 1778102

## KM with 10 clusters.
## $betweenss
## [1] 2082016
## 
## $tot.withinss
## [1] 1734621

## KM with 11 clusters.
## $betweenss
## [1] 2117752
## 
## $tot.withinss
## [1] 1698885

## KM with 12 clusters.
## $betweenss
## [1] 2150178
## 
## $tot.withinss
## [1] 1666459

Compared to the clustering result from Sade-Feldman et al., 2018

xnm = "1-s2.0-S0092867418313941-mmc1.xlsx"
table_s1 = readxl::read_xlsx(paste0("../../_reference/Sade_Feldman_2018/", xnm), 
                             sheet = "Cluster annotation-Fig1B-C", col_names = TRUE)
dim(df_top50PCs)
## [1] 16291    61
dim(table_s1)
## [1] 16291     2
table_s1[1:2,]
table(table_s1$`Cluster number`)
## 
##    1    2    3    4    5    6    7    8    9   10   11 
## 1455  305 1391  290 2165 2222 1740 2165 1656 1773 1129
clabel = c("B_cells", "Plasma_cells", "Monocytes_Macrophages", "Dendritic_cells", 
           "Lymphocytes", "Exhausted_CD8T", "Tregs", "Cytotoxicity", 
           "Exhausted_HS_CD8T", "memory_T", "Lymphocytes_cell_cycle")
table_s1$cluster_label = clabel[table_s1$`Cluster number`]
dim(table_s1)
## [1] 16291     3
table_s1[1:2,]
table(rownames(df_top50PCs) == table_s1$`Cell Name`)
## 
## FALSE  TRUE 
##   991 15300
setequal(rownames(df_top50PCs), table_s1$`Cell Name`)
## [1] FALSE
cell_nm_check = cbind(rownames(df_top50PCs), table_s1$`Cell Name`)
w2check = which(cell_nm_check[,1] != cell_nm_check[,2])
cell_nm_check = cell_nm_check[w2check,]
dim(cell_nm_check)
## [1] 991   2
cell_nm_check[c(1:2,501:502, 990:991),]
##      [,1]                               [,2]                
## [1,] "A3_P7_MMD9_L001_myeloid_enriched" "A3_P7_MMD9_L001"   
## [2,] "A5_P7_MMD9_L001_myeloid_enriched" "A5_P7_MMD9_L001"   
## [3,] "E3_P12_MMD2-36B_T_enriched"       "E3_P12_MMD2-36B_DN"
## [4,] "E4_P12_MMD2-36B_T_enriched"       "E4_P12_MMD2-36B_DN"
## [5,] "H8_P5_M67_L001_T_enriched"        "H8_P5_M67_L001"    
## [6,] "H9_P5_M67_L001_T_enriched"        "H9_P5_M67_L001"
cell_nm_check[,1] = gsub("_myeloid_enriched", "", cell_nm_check[,1], fixed=TRUE)
cell_nm_check[,1] = gsub("_T_enriched", "", cell_nm_check[,1], fixed=TRUE)
cell_nm_check[,2] = gsub("_DN$", "",  cell_nm_check[,2], perl=TRUE)
cell_nm_check[,2] = gsub("_DN1$", "", cell_nm_check[,2], perl=TRUE)
cell_nm_check[,2] = gsub("_DP1$", "", cell_nm_check[,2], perl=TRUE)

table(cell_nm_check[,1] == cell_nm_check[,2])
## 
## TRUE 
##  991
df_top50PCs$sf_cluster = as.factor(table_s1$`Cluster number`)
df_top50PCs$sf_label   = as.factor(table_s1$cluster_label)

Cluster annotation from Sade-Feldman et al., 2018, 11 clusters

ggplot(data=df_top50PCs) + 
  geom_point(aes(x=topPC_UMAP1, y=topPC_UMAP2, color=sf_label), 
             size = 0.1) + 
  guides(color = guide_legend(override.aes = list(size = 2)))

Cluster annotation from my Kmeans analysis, 12 clusters

ggplot(data=df_top50PCs) + 
  geom_point(aes(x=topPC_UMAP1, y=topPC_UMAP2, color=KM_12), size = 0.1) + 
  guides(color = guide_legend(override.aes = list(size = 2)))

tb1 = table(df_top50PCs$KM_12, df_top50PCs$sf_label)
tb1
##     
##      B_cells Cytotoxicity Dendritic_cells Exhausted_CD8T Exhausted_HS_CD8T
##   1        1          667               0             13                11
##   2        1            1               0              0                 0
##   3        2            6               0             28                 8
##   4        0            0               2              1                 0
##   5        5           58               1             44                33
##   6        1         1048               0            549               897
##   7        1            1               0              0                 1
##   8        0            0             282              0                 0
##   9       15          167               5             43                20
##   10       2          215               0           1542               686
##   11       0            0               0              0                 0
##   12    1427            2               0              2                 0
##     
##      Lymphocytes Lymphocytes_cell_cycle memory_T Monocytes_Macrophages
##   1          244                      7       16                     0
##   2            1                      0        0                     0
##   3            4                    806        5                     7
##   4            1                      0        0                   603
##   5           32                     92      138                     5
##   6         1163                     20      130                     1
##   7            1                      0        0                   399
##   8            0                      0        0                    16
##   9          683                      5     1442                    31
##   10          35                    190       36                     0
##   11           0                      2        0                   329
##   12           1                      7        6                     0
##     
##      Plasma_cells Tregs
##   1             0     4
##   2           300     0
##   3             3    17
##   4             0     1
##   5             0  1271
##   6             0    33
##   7             0     1
##   8             0     0
##   9             1   395
##   10            0    14
##   11            0     0
##   12            1     4
rowSums(tb1 > 100)
##  1  2  3  4  5  6  7  8  9 10 11 12 
##  2  1  1  1  2  5  1  1  4  4  1  1
colSums(tb1 > 100)
##                B_cells           Cytotoxicity        Dendritic_cells 
##                      1                      4                      1 
##         Exhausted_CD8T      Exhausted_HS_CD8T            Lymphocytes 
##                      2                      2                      3 
## Lymphocytes_cell_cycle               memory_T  Monocytes_Macrophages 
##                      2                      3                      3 
##           Plasma_cells                  Tregs 
##                      1                      2

Select G1 B cells, G2 Plasma cells, G3 Monocytes/Macrophages, G4 Dendritic cells, G7 Regulatory T cells and G10 Memory T-cells by taking the largest intersection between the clusters defined in the paper and our clusters

cell_type = data.frame(cell = cell.names, type1 = NA)
for(ct1 in c("B_cells", "Dendritic_cells", "Monocytes_Macrophages", "Plasma_cells")){
  kmc = which(tb1[,ct1] > 100)
  cell_type$type1[which(df_top50PCs$KM_12 %in% kmc & df_top50PCs$sf_label==ct1)] = ct1
}

for(ct1 in c("Tregs", "memory_T")){
  kmc = which.max(tb1[,ct1])
  cell_type$type1[which(df_top50PCs$KM_12 %in% kmc & df_top50PCs$sf_label==ct1)] = ct1
}

table(cell_type$type1, df_top50PCs$KM_12)
##                        
##                            1    2    3    4    5    6    7    8    9   10   11
##   B_cells                  0    0    0    0    0    0    0    0    0    0    0
##   Dendritic_cells          0    0    0    0    0    0    0  282    0    0    0
##   memory_T                 0    0    0    0    0    0    0    0 1442    0    0
##   Monocytes_Macrophages    0    0    0  603    0    0  399    0    0    0  329
##   Plasma_cells             0  300    0    0    0    0    0    0    0    0    0
##   Tregs                    0    0    0    0 1271    0    0    0    0    0    0
##                        
##                           12
##   B_cells               1427
##   Dendritic_cells          0
##   memory_T                 0
##   Monocytes_Macrophages    0
##   Plasma_cells             0
##   Tregs                    0
rm(top50pcs)
rm(df_top50PCs)

Select CD8_B cells and CD8_G cells

  • 2482 CD8_B cells: 7
  • 3043 CD8_G cells: 8
# Cell cluster annotation of CD8+ T cells 
table_s2 = read.csv("output/CD8_cluster.csv", stringsAsFactors = FALSE)
dim(table_s2)
## [1] 6350    3
table_s2[1:2,]
table(table_s2$Cluster, table_s2$ruoyi_cluster, useNA="ifany")
##        
##         CD8_B CD8_G <NA>
##   CD8_B  2351     0  672
##   CD8_G     0  3046  281
CD8_B = table_s2$Cell.Name[which(table_s2$ruoyi_cluster=="CD8_B")]
CD8_G = table_s2$Cell.Name[which(table_s2$ruoyi_cluster=="CD8_G")]

table(CD8_B %in% cell_type$cell)
## 
## TRUE 
## 2351
table(CD8_G %in% cell_type$cell)
## 
## TRUE 
## 3046
cell_type$type2 = rep("others", nrow(cell_type))
cell_type$type2[match(CD8_B,cell_type$cell)] = "CD8T_B"
cell_type$type2[match(CD8_G,cell_type$cell)] = "CD8T_G"

table(cell_type$type1, cell_type$type2, useNA="ifany")
##                        
##                         CD8T_B CD8T_G others
##   B_cells                    0      0   1427
##   Dendritic_cells            1      0    281
##   memory_T                   2    341   1099
##   Monocytes_Macrophages      1      0   1330
##   Plasma_cells              11      5    284
##   Tregs                     14      5   1252
##   <NA>                    2322   2695   5221

Fine-clustering for NK cells

First, I get all the CD8 and NK cells, corresponding to the cells in clusters 5 (Lymphocytes), 6 (Exhausted_CD8T), 8 (Cytotoxicity), 9 (Exhausted_HS_CD8T) and 11 (Lymphocytes_cell_cycle) in Sade_Feldman et al. (2018). There are 9337 such cells. Then, I excluded the CD8 T cells using the annotation in the paper. 3513 cells remained after excluding the CD8 T cells.

Data for only CD8 and NK cells

table(paste(table_s1$`Cluster number`, table_s1$cluster_label))
## 
##                 1 B_cells               10 memory_T 11 Lymphocytes_cell_cycle 
##                      1455                      1773                      1129 
##            2 Plasma_cells   3 Monocytes_Macrophages         4 Dendritic_cells 
##                       305                      1391                       290 
##             5 Lymphocytes          6 Exhausted_CD8T                   7 Tregs 
##                      2165                      2222                      1740 
##            8 Cytotoxicity       9 Exhausted_HS_CD8T 
##                      2165                      1656
CD8.NK.cells = cell.names[as.numeric(table_s1$`Cluster number`) %in% c(5,6,8,9,11)]
TNK.cells    = CD8.NK.cells[! CD8.NK.cells %in% table_s2$Cell.Name] 
TNKcells_tpm = sf_tpm[, ..TNK.cells]

row.names(TNKcells_tpm) = gene.names

dim(TNKcells_tpm)
## [1] 55737  3513
TNKcells_tpm[1:2,1:5]

How many cells each gene is expressed

gene.grouping = rep(1:28, each = 2000)[1:dim(TNKcells_tpm)[1]]
cells.per.gene = NULL
for (i in 1:28) {
  temp.counts = apply(TNKcells_tpm[gene.grouping==i,], 1, FUN = function(x) sum(x>0))
  cells.per.gene = c(cells.per.gene,temp.counts)
}
table(cells.per.gene == 0)
## 
## FALSE  TRUE 
## 43842 11895
table(cells.per.gene == 1)
## 
## FALSE  TRUE 
## 49939  5798
hist(log10(cells.per.gene), main = "", breaks = 50, 
     xlab = "log10(number of cells) each gene is expressed")

rm(gene.grouping)

Choose high-variance genes

plot the variance of the expression levels for each gene against its mean expression level

# kept 10925 genes
genes2kp = which(cells.per.gene > 150)
large.express.genes.TNK = rownames(TNKcells_tpm)[genes2kp]
large.expressed.tpm = TNKcells_tpm[genes2kp,]
dim(large.expressed.tpm)
## [1] 10925  3513
df_mv = data.frame(gene=large.express.genes.TNK, 
                   mean=rowMeans(large.expressed.tpm), 
                   var=apply(large.expressed.tpm, 1, var))

rm(large.expressed.tpm)
ggplot(data = df_mv) + 
  geom_point(aes(x=mean,y=var),size = 0.1)

Next, divide the genes into 20 equal-size bins based on their mean expression levels, and select the first 50 genes with the highest variance from each bin.

df_mv$grouping = cut_number(df_mv$mean, n=20)
table(df_mv$grouping)
## 
## [0.0226,0.215]  (0.215,0.281]   (0.281,0.33]   (0.33,0.378]  (0.378,0.432] 
##            547            546            546            546            547 
##  (0.432,0.494]  (0.494,0.562]   (0.562,0.64]   (0.64,0.727]  (0.727,0.822] 
##            546            546            549            543            547 
##  (0.822,0.932]   (0.932,1.05]     (1.05,1.2]     (1.2,1.38]     (1.38,1.6] 
##            546            546            546            546            547 
##     (1.6,1.87]    (1.87,2.26]    (2.26,2.92]     (2.92,4.2]     (4.2,16.4] 
##            546            546            546            546            547
hv.genes.list = by(df_mv, df_mv$grouping, 
                   FUN = function(df) return(df$gene[order(df$var,decreasing = TRUE)][1:50]))
hv.genes.TNK = as.character(unlist(hv.genes.list))
rm(hv.genes.list)

hvg_TNKcells_tpm = TNKcells_tpm[match(hv.genes.TNK, rownames(TNKcells_tpm)),]
hvg_TNKcells_tpm = t(hvg_TNKcells_tpm)
colnames(hvg_TNKcells_tpm) = hv.genes.TNK
rm(df_mv)

dim(hvg_TNKcells_tpm)
## [1] 3513 1000
hvg_TNKcells_tpm[1:2,1:4]
##            ZNF816 AP000476.1 ZNF549 MLTK
## A10_P3_M11      0          0      0    0
## A11_P1_M11      0          0      0    0
df_mv2 = data.frame(gene=hv.genes.TNK, 
                   mean=colMeans(hvg_TNKcells_tpm), 
                   var=apply(hvg_TNKcells_tpm, 2, var))

ggplot(data = df_mv2) + 
  geom_point(aes(x=mean,y=var),size = 0.1)

Perform PCA on selected genes

pca.hvg = princomp(log(hvg_TNKcells_tpm+1), cor = TRUE)
rm(hvg_TNKcells_tpm)
pca.scores = pca.hvg$scores
rm(pca.hvg)

Clustering on top 50 PCs

top50pcs_TNKcells = pca.scores[,1:50]
rm(pca.scores)

Calculate UMAP

pcs_umap = umap(top50pcs_TNKcells)
df_umap  = as.data.frame(pcs_umap$layout)
names(df_umap) = paste0("topPC_UMAP",seq(ncol(df_umap)))
rm(pcs_umap)

K-means clustering on top 50 PCs

Prepare dataframe for k-means clustering using top 50 PCs

df_top50PCs_TNKcells = cbind(top50pcs_TNKcells, df_umap)
rm(df_umap)
dim(df_top50PCs_TNKcells)
## [1] 3513   52
df_top50PCs_TNKcells[1:2,c(1:2,50:52)]
set.seed(9999)
all_num_clust = c(2:6)
for (num_clust in all_num_clust) {
  cat(paste0("KM with ",num_clust," clusters.\n"))
  kmeans_out = kmeans(top50pcs_TNKcells, centers = num_clust, 
                      iter.max = 1e3, 
                      nstart = 50, algorithm = "MacQueen")
  print(kmeans_out[c("betweenss","tot.withinss")])
  km_label = paste0("KM_",num_clust)
  df_top50PCs_TNKcells[[km_label]] = as.factor(kmeans_out$cluster)
  pt = ggplot(data=df_top50PCs_TNKcells) + 
    geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,
                   color=eval(as.name(paste(km_label)))), size = 0.1) + 
    scale_colour_discrete(name=paste(km_label)) + 
    guides(color = guide_legend(override.aes = list(size = 2)))
  print(pt)
  rm(kmeans_out)
  rm(pt)
}
## KM with 2 clusters.
## $betweenss
## [1] 168403.8
## 
## $tot.withinss
## [1] 573296.8

## KM with 3 clusters.
## $betweenss
## [1] 206432.2
## 
## $tot.withinss
## [1] 535268.5

## KM with 4 clusters.
## $betweenss
## [1] 231524.6
## 
## $tot.withinss
## [1] 510176

## KM with 5 clusters.
## $betweenss
## [1] 246807.8
## 
## $tot.withinss
## [1] 494892.9

## KM with 6 clusters.
## $betweenss
## [1] 259523.3
## 
## $tot.withinss
## [1] 482177.3

Cell cluster annotation from Sade-Feldman et al., 2018

df_top50PCs_TNKcells$sf_cluster = 
  as.factor(table_s1$cluster_label[match(TNK.cells, cell.names)])
ggplot(data=df_top50PCs_TNKcells) + 
  geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,color=sf_cluster),size = 0.1) + 
  guides(color = guide_legend(override.aes = list(size = 2)))

The clustering results from k-means analysis using 3-6 clusters are generally very similar: G5-Lymphocytes and G8-Cytotoxicity form one cluster, G6-Exhausted_CD8T and G9-Exhausted_HS_CD8T form one cluster, and G11-Lymphocytes_cell_cycle forms an individual cluster. Using 3 clusters would be suffice.

Obtain the labeling of NK cells

NK1 = row.names(df_top50PCs_TNKcells)[df_top50PCs_TNKcells$KM_3==1]
NK2 = row.names(df_top50PCs_TNKcells)[df_top50PCs_TNKcells$KM_3==2]
NK3 = row.names(df_top50PCs_TNKcells)[df_top50PCs_TNKcells$KM_3==3]
cell_type$type2[match(NK1,cell_type$cell)] = "NK1"
cell_type$type2[match(NK2,cell_type$cell)] = "NK2"
cell_type$type2[match(NK3,cell_type$cell)] = "NK3"

dim(cell_type)
## [1] 16291     3
cell_type[1:2,]
table(cell_type$type1, useNA = "ifany")
## 
##               B_cells       Dendritic_cells              memory_T 
##                  1427                   282                  1442 
## Monocytes_Macrophages          Plasma_cells                 Tregs 
##                  1331                   300                  1271 
##                  <NA> 
##                 10238
table(cell_type$type2, useNA = "ifany")
## 
## CD8T_B CD8T_G    NK1    NK2    NK3 others 
##   2351   3046    456   1961   1096   7381
table(cell_type$type1, cell_type$type2, useNA = "ifany")
##                        
##                         CD8T_B CD8T_G  NK1  NK2  NK3 others
##   B_cells                    0      0    0    0    0   1427
##   Dendritic_cells            1      0    0    0    0    281
##   memory_T                   2    341    0    0    0   1099
##   Monocytes_Macrophages      1      0    0    0    0   1330
##   Plasma_cells              11      5    0    0    0    284
##   Tregs                     14      5    0    0    0   1252
##   <NA>                    2322   2695  456 1961 1096   1708

Merge the cell type definition from the Kmeans clustering of all the cells (cell_type$type1) and the Kmean clustering of the CD8+ T cells and NK cells (cell_type$type2).

cell_type$type1[which(is.na(cell_type$type1))] = "unclustered"

# remove the cluster membership for any cells classified as CD8T or NK
cell_type$type = cell_type$type1
cell_type$type[which(cell_type$type2 != "others")] = "unclustered"

# now only CD4+ T cells are left for T memory cells
cell_type$type[which(cell_type$type == "memory_T")] = "CD4T_memory"

# add labels of CD8+ T and NK cells
ct_TNK = (cell_type$type1 == "unclustered")
cell_type$type[which(ct_TNK & cell_type$type2 == "CD8T_B")] = "CD8T_B"

ct_TmemNK = (cell_type$type1 %in% c("memory_T", "unclustered"))
cell_type$type[which(ct_TmemNK & cell_type$type2 == "CD8T_G")] = "CD8T_G"

cell_type$type[which(cell_type$type2 == "NK1")] = "NK1"
cell_type$type[which(cell_type$type2 == "NK2")] = "NK2"
cell_type$type[which(cell_type$type2 == "NK3")] = "NK3"

table(cell_type$type, cell_type$type1, useNA = "ifany")
##                        
##                         B_cells Dendritic_cells memory_T Monocytes_Macrophages
##   B_cells                  1427               0        0                     0
##   CD4T_memory                 0               0     1099                     0
##   CD8T_B                      0               0        0                     0
##   CD8T_G                      0               0      341                     0
##   Dendritic_cells             0             281        0                     0
##   Monocytes_Macrophages       0               0        0                  1330
##   NK1                         0               0        0                     0
##   NK2                         0               0        0                     0
##   NK3                         0               0        0                     0
##   Plasma_cells                0               0        0                     0
##   Tregs                       0               0        0                     0
##   unclustered                 0               1        2                     1
##                        
##                         Plasma_cells Tregs unclustered
##   B_cells                          0     0           0
##   CD4T_memory                      0     0           0
##   CD8T_B                           0     0        2322
##   CD8T_G                           0     0        2695
##   Dendritic_cells                  0     0           0
##   Monocytes_Macrophages            0     0           0
##   NK1                              0     0         456
##   NK2                              0     0        1961
##   NK3                              0     0        1096
##   Plasma_cells                   284     0           0
##   Tregs                            0  1252           0
##   unclustered                     16    19        1708
table(cell_type$type, cell_type$type2, useNA = "ifany")
##                        
##                         CD8T_B CD8T_G  NK1  NK2  NK3 others
##   B_cells                    0      0    0    0    0   1427
##   CD4T_memory                0      0    0    0    0   1099
##   CD8T_B                  2322      0    0    0    0      0
##   CD8T_G                     0   3036    0    0    0      0
##   Dendritic_cells            0      0    0    0    0    281
##   Monocytes_Macrophages      0      0    0    0    0   1330
##   NK1                        0      0  456    0    0      0
##   NK2                        0      0    0 1961    0      0
##   NK3                        0      0    0    0 1096      0
##   Plasma_cells               0      0    0    0    0    284
##   Tregs                      0      0    0    0    0   1252
##   unclustered               29     10    0    0    0   1708

illustrate the results for a few genes

cluster_ct = cell_type$type[match(names(sf_tpm), cell_type$cell)]

plot1 <- function(gene1, edata, cluster_ct){
  y      = as.numeric(edata[which(rownames(edata) == gene1),])
  df1    = data.frame(expression = y, cell_type = cluster_ct)
  
  g1 = ggplot(df1, aes(x=cell_type, y=expression, fill=cell_type)) +
    geom_boxplot() + coord_flip() + theme(legend.position = "none") +
    ggtitle(gene1)
  print(g1)
}

plot2 <- function(gene1, gene2, edata, cluster_ct, cluster2use){
  y1     = as.numeric(edata[which(rownames(edata) == gene1),])
  y2     = as.numeric(edata[which(rownames(edata) == gene2),])
  df1    = data.frame(y1=y1, y2=y2, cell_type = cluster_ct)
  df1    = df1[which(df1$cell_type %in% cluster2use),]
  g1 = ggplot(df1, aes(x=y1, y=y2, color=cell_type)) +
    geom_point(size=0.4) + xlab(gene1) + ylab(gene2)
  print(g1)
}


plot1("FOXP3", sf_tpm, cluster_ct)

plot1("GZMA",  sf_tpm, cluster_ct)

plot1("PDCD1", sf_tpm, cluster_ct)

plot1("CTLA4", sf_tpm, cluster_ct)

plot1("LAG3",  sf_tpm, cluster_ct)

plot1("CD3E",  sf_tpm, cluster_ct)

plot1("CD4",   sf_tpm, cluster_ct)

plot1("CD8A",  sf_tpm, cluster_ct)

plot1("CD8B",  sf_tpm, cluster_ct)

plot1("CD33",  sf_tpm, cluster_ct)

plot1("CD14",  sf_tpm, cluster_ct)

plot1("CD19",  sf_tpm, cluster_ct)

plot1("CD8A",  sf_tpm, cluster_ct)

plot1("CD8B",  sf_tpm, cluster_ct)

cluster2use = c("CD8T_B", "CD8T_G", "NK1", "NK2", "NK3")
plot2("CD8A", "CD8B", sf_tpm, cluster_ct, cluster2use)

# CD56
plot1("NCAM1",  sf_tpm, cluster_ct)

 # CD16
plot1("FCGR3A", sf_tpm, cluster_ct)

# NKP46
plot1("NCR1",   sf_tpm, cluster_ct)

Next check 13 marker genes from > Crinier, Adeline, et al. “High-dimensional single-cell analysis identifies organ-specific signatures and conserved NK cell subsets in humans and mice.” Immunity 49.5 (2018): 971-986.

plot1("CD160",   sf_tpm, cluster_ct)

plot1("CD244",   sf_tpm, cluster_ct)

plot1("CHST12",  sf_tpm, cluster_ct)

plot1("CST7",    sf_tpm, cluster_ct)

plot1("GNLY",    sf_tpm, cluster_ct)

plot1("IL18RAP", sf_tpm, cluster_ct)

plot1("IL2RB",   sf_tpm, cluster_ct)

# NKG2A
plot1("KLRC1",   sf_tpm, cluster_ct)

# NKG2E
plot1("KLRC3",   sf_tpm, cluster_ct)

# CD94
plot1("KLRD1",   sf_tpm, cluster_ct)

# NKp80
plot1("KLRF1",   sf_tpm, cluster_ct)

plot1("PRF1",    sf_tpm, cluster_ct)

plot1("XCL2",    sf_tpm, cluster_ct)

plot2("GNLY", "CD8A",  sf_tpm, cluster_ct, cluster2use)

Next additional marker genes from > Freud, Aharon G., et al. “The broad spectrum of human natural killer cell diversity.” Immunity 47.5 (2017): 820-833.

# NKG2C
plot1("KLRC2",   sf_tpm, cluster_ct)

# NKG2D
plot1("KLRK1",   sf_tpm, cluster_ct)

plot2("KLRK1", "CD8A",  sf_tpm, cluster_ct, cluster2use)

plot1("EOMES",   sf_tpm, cluster_ct)

plot2("EOMES", "CD8A",  sf_tpm, cluster_ct, cluster2use)

# T-BET
plot1("TBX21",   sf_tpm, cluster_ct)

# KIR genes
plot1("KIR3DL1",   sf_tpm, cluster_ct)

plot1("KIR3DL2",   sf_tpm, cluster_ct)

plot1("KIR3DL3",   sf_tpm, cluster_ct)

plot1("KIR3DP1",   sf_tpm, cluster_ct)

Save the current labeling of the cells that we obtained from the K-means analysis

table(cell_type$type1[cell_type$type2=="others"])
## 
##               B_cells       Dendritic_cells              memory_T 
##                  1427                   281                  1099 
## Monocytes_Macrophages          Plasma_cells                 Tregs 
##                  1330                   284                  1252 
##           unclustered 
##                  1708
write.table(cell_type, "output/cell_type.txt", quote = FALSE, 
            col.names = TRUE, row.names = FALSE, 
            na = "unclustered", sep = "\t")

Conclusion

gc()
##              used   (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells    2160295  115.4    5734478   306.3         NA    5734478   306.3
## Vcells 1216409633 9280.5 2129235534 16244.8      32768 2129214556 16244.7
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] umap_0.2.7.0      ggcorrplot_0.1.3  ggplot2_3.3.1     R.utils_2.9.2    
## [5] R.oo_1.23.0       R.methodsS3_1.8.0 Rtsne_0.15        data.table_1.12.8
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.3       RSpectra_0.16-0  cellranger_1.1.0 compiler_3.6.2  
##  [5] pillar_1.4.3     tools_3.6.2      digest_0.6.23    jsonlite_1.6.1  
##  [9] lattice_0.20-38  evaluate_0.14    lifecycle_0.2.0  tibble_3.0.1    
## [13] gtable_0.3.0     pkgconfig_2.0.3  rlang_0.4.6      Matrix_1.2-18   
## [17] yaml_2.2.1       xfun_0.12        withr_2.1.2      stringr_1.4.0   
## [21] dplyr_0.8.4      knitr_1.28       askpass_1.1      vctrs_0.3.0     
## [25] grid_3.6.2       tidyselect_1.0.0 reticulate_1.15  glue_1.3.1      
## [29] R6_2.4.1         readxl_1.3.1     rmarkdown_2.1    farver_2.0.3    
## [33] purrr_0.3.3      magrittr_1.5     scales_1.1.0     htmltools_0.4.0 
## [37] ellipsis_0.3.0   assertthat_0.2.1 colorspace_1.4-1 labeling_0.3    
## [41] stringi_1.4.5    openssl_1.4.1    munsell_0.5.0    crayon_1.3.4